suppressPackageStartupMessages(library(tidyverse))
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities
# Project root. The trailing slash matters: the paths below are built with
# paste0(), which inserts no separator.
wd <- "~/Google Drive/My Drive/Analysis/METTL2A/"
# Make the project root the working directory for the rest of the script.
setwd(wd)
# Directory passed as `outdir` to export_tsv() further down.
tabledir <- paste0(wd, 'Tables/DRS_m3C_sites/')
# Directory passed as `outdir` to ggsave_multiple_formats() further down.
figdir <- paste0(wd, 'Figures/DRS_m3C_sites/Chrs/')
# Read a two-column TSV of methylated positions.
#
# The file is read with explicit column names (`position`, `kmer`). The first
# column is then split on a literal `|` into `transcript_id` and `position`.
#
# path: location of the TSV file, handed to read_tsv().
# Returns the table with columns `transcript_id`, `position` and `kmer`.
read_methylated_position_tsv <- function(path) {
  read_tsv(
    path,
    col_names = c('position', 'kmer'),
    # Spell out FALSE: `F` is an ordinary binding that can be reassigned.
    show_col_types = FALSE
  ) |>
    separate(position, into = c('transcript_id', 'position'), sep = '[|]')
}
# Default ggplot2 theme for the script: classic theme at base size 7 with the
# legend placed below the panel.
theme_set(
  theme_classic(base_size = 7) + theme(legend.position = 'bottom')
)
# Count rows per group and express each count as a percentage of the total.
#
# df: a data frame, normally already grouped with group_by(); the grouping
#     columns are carried through to the result.
# Returns one row per group with the count `n` and `percentage`
# (100 * n / sum of all n), ordered from the largest percentage down.
calc_percentage <- function(df) {
  group_sizes <- reframe(df, n = n())
  with_share <- mutate(group_sizes, percentage = 100 * n / sum(n))
  arrange(with_share, desc(percentage))
}
# Draw a donut chart with one ring segment per `genetype2` category.
#
# df: a table with a `genetype2` column and a `percentage` column; the
#     segment boundaries (ymin/ymax) are derived from `percentage` by
#     add_yrange().
# Returns the ggplot object.
donutplot_genetype <- function(df) {
  # Same four colours for the segment fill and its outline.
  genetype_palette <- c('#0099ff', '#ff9900', '#ff0099', '#9900ff')

  ring_segments <- add_yrange(df)
  ring_mapping <- aes(
    xmin = 2, xmax = 4, ymin = ymin, ymax = ymax,
    fill = genetype2, colour = genetype2
  )
  # Labels sit at the angular midpoint of each segment, inside the ring.
  segment_labels <- ggrepel::geom_text_repel(
    aes(label = genetype2, y = (ymin + ymax) / 2), x = 1
  )

  ggplot(ring_segments, ring_mapping) +
    geom_rect() +
    # Polar transform on y turns the stacked rectangles into ring segments.
    coord_polar(theta = 'y') +
    segment_labels +
    # x range starting below xmin leaves the hole in the middle.
    xlim(c(-1,4)) +
    scale_fill_manual(values = genetype_palette) +
    scale_color_manual(values = genetype_palette) +
    theme_void()
}
#' # Read methylated position information and add annotation
# Load the cleaned Espresso annotation table and preview it.
espresso_annotation <-
  paste0(wd, 'Tables/Espresso_AsPC1_annotation_cleaned_2024-03-29.tsv') |>
  read_tsv()
## Rows: 36717 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (12): seqname, source, feature, score, strand, frame, gene_id, transcrip...
## dbl (2): start, end
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
espresso_annotation
## # A tibble: 36,717 × 14
## seqname source feature start end score strand frame gene_id transcript_id
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 chr3 annot… transc… 3.15e6 3.15e6 . - . ENSG00… ENST00000498…
## 2 chr3 annot… transc… 3.15e6 3.15e6 . - . ENSG00… ENST00000459…
## 3 chr3 annot… transc… 3.15e6 3.18e6 . - . ENSG00… ENST00000231…
## 4 chr3 annot… transc… 3.15e6 3.18e6 . - . ENSG00… ENST00000432…
## 5 chr3 annot… transc… 3.13e6 3.13e6 . + . ENSG00… ENST00000339…
## 6 chr3 annot… transc… 3.15e6 3.16e6 . - . ENSG00… ENST00000488…
## 7 chr3 annot… transc… 3.13e6 3.13e6 . + . ENSG00… ENST00000420…
## 8 chr3 annot… transc… 3.14e6 3.15e6 . + . ENSG00… ENST00000698…
## 9 chr3 annot… transc… 3.17e6 3.18e6 . - . ENSG00… ENST00000450…
## 10 chr3 annot… transc… 3.15e6 3.15e6 . + . ENSG00… ENST00000698…
## # ℹ 36,707 more rows
## # ℹ 4 more variables: gene_type <chr>, gene_name <chr>, transcript_type <chr>,
## # transcript_name <chr>
# Load the candidate positions and keep only rows whose `middle_isC` column
# equals 'C', then preview the result.
methylated_positions <-
  paste0(wd, 'Tables/DRS/Positions/intensityup_common_2024-04-10.tsv.gz') |>
  read_tsv() |>
  filter(middle_isC == 'C')
## Rows: 605 Columns: 65
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (30): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (35): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
methylated_positions
## # A tibble: 489 × 65
## transcript_id transcript_name position ref_kmer GMM_logit_pvalue_G
## <chr> <chr> <dbl> <chr> <dbl>
## 1 ENST00000429711.7 RPL32-204 422 GCCCA 1
## 2 ENST00000647248.2 RPL35A-211 380 ACCCC 1
## 3 ENST00000647248.2 RPL35A-211 381 CCCCT 1
## 4 ENST00000389680.2 MT-RNR1-201 57 CCCCG 1
## 5 ENST00000389680.2 MT-RNR1-201 75 ACCCT 0.777
## 6 ENST00000389680.2 MT-RNR1-201 93 ATCAA 1
## 7 ENST00000389680.2 MT-RNR1-201 148 GCCAC 1
## 8 ENST00000389680.2 MT-RNR1-201 153 ACCCC 1
## 9 ENST00000389680.2 MT-RNR1-201 154 CCCCC 1
## 10 ENST00000389680.2 MT-RNR1-201 155 CCCCA 1
## # ℹ 479 more rows
## # ℹ 60 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## # GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## # Logit_LOR_G <dbl>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## # c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## # c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## # c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Sanity check: share of sites whose k-mer has a C flanked by two characters
# on each side. The grouping expression is left untouched because its text
# becomes the column name in the printed result.
methylated_positions |>
  group_by(grepl('.{2}C.{2}', ref_kmer)) |>
  calc_percentage()
## # A tibble: 1 × 3
## `grepl(".{2}C.{2}", ref_kmer)` n percentage
## <lgl> <int> <dbl>
## 1 TRUE 489 100
# Chromosome distribution of the sites that match the centre-C pattern.
methylated_positions |>
  filter(grepl('.{2}C.{2}', ref_kmer)) |>
  group_by(seqname) |>
  calc_percentage()
## # A tibble: 24 × 3
## seqname n percentage
## <chr> <int> <dbl>
## 1 chrM 229 46.8
## 2 chr12 47 9.61
## 3 chr16 41 8.38
## 4 chr1 34 6.95
## 5 chr11 28 5.73
## 6 chr2 19 3.89
## 7 chr7 16 3.27
## 8 chr19 14 2.86
## 9 chr5 9 1.84
## 10 chr15 6 1.23
## # ℹ 14 more rows
# Complement of the previous check: chromosome distribution of the sites that
# do NOT match the centre-C pattern.
methylated_positions |>
  filter(!grepl('.{2}C.{2}', ref_kmer)) |>
  group_by(seqname) |>
  calc_percentage()
## # A tibble: 0 × 3
## # ℹ 3 variables: seqname <chr>, n <int>, percentage <dbl>
# Sanity check: share of sites whose k-mer contains a C at any position. The
# grouping expression is left untouched because its text becomes the column
# name in the printed result.
methylated_positions |>
  group_by(grepl('C', ref_kmer)) |>
  calc_percentage()
## # A tibble: 1 × 3
## `grepl("C", ref_kmer)` n percentage
## <lgl> <int> <dbl>
## 1 TRUE 489 100
# Chromosome distribution of the sites whose k-mer contains a C anywhere.
methylated_positions |>
  filter(grepl('C', ref_kmer)) |>
  group_by(seqname) |>
  calc_percentage()
## # A tibble: 24 × 3
## seqname n percentage
## <chr> <int> <dbl>
## 1 chrM 229 46.8
## 2 chr12 47 9.61
## 3 chr16 41 8.38
## 4 chr1 34 6.95
## 5 chr11 28 5.73
## 6 chr2 19 3.89
## 7 chr7 16 3.27
## 8 chr19 14 2.86
## 9 chr5 9 1.84
## 10 chr15 6 1.23
## # ℹ 14 more rows
# Keep only the sites matching the centre-C pattern, then preview them.
methylated_positions_center_C <- filter(
  methylated_positions,
  grepl('.{2}C.{2}', ref_kmer)
)
methylated_positions_center_C
## # A tibble: 489 × 65
## transcript_id transcript_name position ref_kmer GMM_logit_pvalue_G
## <chr> <chr> <dbl> <chr> <dbl>
## 1 ENST00000429711.7 RPL32-204 422 GCCCA 1
## 2 ENST00000647248.2 RPL35A-211 380 ACCCC 1
## 3 ENST00000647248.2 RPL35A-211 381 CCCCT 1
## 4 ENST00000389680.2 MT-RNR1-201 57 CCCCG 1
## 5 ENST00000389680.2 MT-RNR1-201 75 ACCCT 0.777
## 6 ENST00000389680.2 MT-RNR1-201 93 ATCAA 1
## 7 ENST00000389680.2 MT-RNR1-201 148 GCCAC 1
## 8 ENST00000389680.2 MT-RNR1-201 153 ACCCC 1
## 9 ENST00000389680.2 MT-RNR1-201 154 CCCCC 1
## 10 ENST00000389680.2 MT-RNR1-201 155 CCCCA 1
## # ℹ 479 more rows
## # ℹ 60 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## # GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## # Logit_LOR_G <dbl>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## # c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## # c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## # c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Sites whose k-mer contains a C at any position. This has to be a row filter,
# mirroring methylated_positions_center_C; the earlier group_by() kept every
# row and merely attached a logical grouping column to the table.
methylated_positions_C <-
  methylated_positions |>
  filter(grepl('C', ref_kmer))
methylated_positions_C
## # A tibble: 489 × 65
## transcript_id transcript_name position ref_kmer GMM_logit_pvalue_G
## <chr> <chr> <dbl> <chr> <dbl>
## 1 ENST00000429711.7 RPL32-204 422 GCCCA 1
## 2 ENST00000647248.2 RPL35A-211 380 ACCCC 1
## 3 ENST00000647248.2 RPL35A-211 381 CCCCT 1
## 4 ENST00000389680.2 MT-RNR1-201 57 CCCCG 1
## 5 ENST00000389680.2 MT-RNR1-201 75 ACCCT 0.777
## 6 ENST00000389680.2 MT-RNR1-201 93 ATCAA 1
## 7 ENST00000389680.2 MT-RNR1-201 148 GCCAC 1
## 8 ENST00000389680.2 MT-RNR1-201 153 ACCCC 1
## 9 ENST00000389680.2 MT-RNR1-201 154 CCCCC 1
## 10 ENST00000389680.2 MT-RNR1-201 155 CCCCA 1
## # ℹ 479 more rows
## # ℹ 60 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## # GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## # Logit_LOR_G <dbl>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## # c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## # c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## # c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Number and percentage of all retained sites per chromosome.
methylated_positions_groupedby_chr <- methylated_positions |>
  group_by(seqname) |>
  calc_percentage()
methylated_positions_groupedby_chr
## # A tibble: 24 × 3
## seqname n percentage
## <chr> <int> <dbl>
## 1 chrM 229 46.8
## 2 chr12 47 9.61
## 3 chr16 41 8.38
## 4 chr1 34 6.95
## 5 chr11 28 5.73
## 6 chr2 19 3.89
## 7 chr7 16 3.27
## 8 chr19 14 2.86
## 9 chr5 9 1.84
## 10 chr15 6 1.23
## # ℹ 14 more rows
# Number and percentage of centre-C sites per chromosome.
methylated_positions_center_C_groupedby_chr <- methylated_positions_center_C |>
  group_by(seqname) |>
  calc_percentage()
methylated_positions_center_C_groupedby_chr
## # A tibble: 24 × 3
## seqname n percentage
## <chr> <int> <dbl>
## 1 chrM 229 46.8
## 2 chr12 47 9.61
## 3 chr16 41 8.38
## 4 chr1 34 6.95
## 5 chr11 28 5.73
## 6 chr2 19 3.89
## 7 chr7 16 3.27
## 8 chr19 14 2.86
## 9 chr5 9 1.84
## 10 chr15 6 1.23
## # ℹ 14 more rows
# Number and percentage of the methylated_positions_C sites per chromosome.
# group_by() here replaces any grouping already present on the input.
methylated_positions_C_groupedby_chr <- methylated_positions_C |>
  group_by(seqname) |>
  calc_percentage()
methylated_positions_C_groupedby_chr
## # A tibble: 24 × 3
## seqname n percentage
## <chr> <int> <dbl>
## 1 chrM 229 46.8
## 2 chr12 47 9.61
## 3 chr16 41 8.38
## 4 chr1 34 6.95
## 5 chr11 28 5.73
## 6 chr2 19 3.89
## 7 chr7 16 3.27
## 8 chr19 14 2.86
## 9 chr5 9 1.84
## 10 chr15 6 1.23
## # ℹ 14 more rows
# Split the sites into chrM vs. everything else and report count and
# percentage for each side.
#
# df: a table with a `seqname` column.
# Returns an ungrouped table with columns `isChrM` (logical), `n` and
# `percentage`, as produced by calc_percentage().
calc_percentage_chrM <- function(df) {
  df |>
    # Name the grouping column directly instead of renaming the label dplyr
    # would generate from the expression text (`seqname == "chrM"`), which
    # breaks as soon as the expression is reformatted.
    group_by(isChrM = seqname == 'chrM') |>
    calc_percentage() |>
    # Defensive: callers always get an ungrouped table.
    ungroup()
}
# chrM vs. non-chrM breakdown for all retained sites.
methylated_positions_groupedby_chrMornot <-
  calc_percentage_chrM(methylated_positions)
methylated_positions_groupedby_chrMornot
## # A tibble: 2 × 3
## isChrM n percentage
## <lgl> <int> <dbl>
## 1 FALSE 260 53.2
## 2 TRUE 229 46.8
# chrM vs. non-chrM breakdown for the centre-C sites.
methylated_positions_center_C_groupedby_chrMornot <-
  calc_percentage_chrM(methylated_positions_center_C)
methylated_positions_center_C_groupedby_chrMornot
## # A tibble: 2 × 3
## isChrM n percentage
## <lgl> <int> <dbl>
## 1 FALSE 260 53.2
## 2 TRUE 229 46.8
# chrM vs. non-chrM breakdown for the methylated_positions_C sites.
methylated_positions_C_groupedby_chrMornot <-
  calc_percentage_chrM(methylated_positions_C)
methylated_positions_C_groupedby_chrMornot
## # A tibble: 2 × 3
## isChrM n percentage
## <lgl> <int> <dbl>
## 1 FALSE 260 53.2
## 2 TRUE 229 46.8
# Turn a `percentage` column into stacked [ymin, ymax] intervals on a 0-1
# scale, one interval per row in row order.
#
# df: a table with a numeric `percentage` column.
# Returns `df` with `ymax` (running total of percentage / 100) and `ymin`
# (the previous row's `ymax`, 0 for the first row) added.
add_yrange <- function(df) {
  stacked <- mutate(df, ymax = cumsum(percentage / 100))
  upper_bounds <- stacked$ymax
  # Each interval starts where the previous one ended.
  stacked$ymin <- c(0, head(upper_bounds, n = -1))
  stacked
}
# Draw a two-colour donut chart keyed on the `isChrM` column.
#
# df: a table with `isChrM` and `percentage` columns; the segment boundaries
#     (ymin/ymax) are derived from `percentage` by add_yrange().
# Returns the ggplot object.
donutplot_chrM <- function(df) {
  # Same two colours for the segment fill and its outline.
  chrM_palette <- c('blue', 'red')

  ring_segments <- add_yrange(df)
  ring_mapping <- aes(
    xmin = 2, xmax = 4, ymin = ymin, ymax = ymax,
    fill = isChrM, colour = isChrM
  )
  # Labels sit at the angular midpoint of each segment, inside the ring.
  segment_labels <- ggrepel::geom_text_repel(
    aes(label = isChrM, y = (ymin + ymax) / 2), x = 1
  )

  ggplot(ring_segments, ring_mapping) +
    geom_rect() +
    # Polar transform on y turns the stacked rectangles into ring segments.
    coord_polar(theta = 'y') +
    segment_labels +
    # x range starting below xmin leaves the hole in the middle.
    xlim(c(-1,4)) +
    scale_fill_manual(values = chrM_palette) +
    scale_color_manual(values = chrM_palette) +
    theme_void()
}
# Donut charts of the chrM vs. non-chrM split for each of the three site sets,
# each written to `figdir`.
# NOTE(review): no file name is passed to ggsave_multiple_formats(), so it
# presumably derives one from the object name - keep these names stable.
methylated_positions_groupedby_chrMornot_donutplot <-
  donutplot_chrM(methylated_positions_groupedby_chrMornot)
ggsave_multiple_formats(
  methylated_positions_groupedby_chrMornot_donutplot,
  width = 5, height = 5, fontsize = 7, outdir = figdir
)

methylated_positions_C_groupedby_chrMornot_donutplot <-
  donutplot_chrM(methylated_positions_C_groupedby_chrMornot)
ggsave_multiple_formats(
  methylated_positions_C_groupedby_chrMornot_donutplot,
  width = 5, height = 5, fontsize = 7, outdir = figdir
)

methylated_positions_center_C_groupedby_chrMornot_donutplot <-
  donutplot_chrM(methylated_positions_center_C_groupedby_chrMornot)
ggsave_multiple_formats(
  methylated_positions_center_C_groupedby_chrMornot_donutplot,
  width = 5, height = 5, fontsize = 7, outdir = figdir
)
# Quick look (not saved): number of sites per chromosome as horizontal bars,
# chromosomes ordered by count.
ggplot(
  methylated_positions_groupedby_chr,
  aes(x = reorder(seqname, n), y = n)
) +
  geom_col() +
  coord_flip()
# Number of distinct annotated transcripts per chromosome/contig, largest
# first. Used further down as the denominator for the per-chromosome rate.
num_detected_transcripts_in_chromosomes <-
  espresso_annotation |>
  distinct(seqname, transcript_id) |>
  group_by(seqname) |>
  reframe(num_detected_transcripts_in_chr = n()) |>
  arrange(desc(num_detected_transcripts_in_chr))
num_detected_transcripts_in_chromosomes
## # A tibble: 69 × 2
## seqname num_detected_transcripts_in_chr
## <chr> <int>
## 1 chr1 3605
## 2 chr2 2776
## 3 chr11 2422
## 4 chr17 2237
## 5 chr19 2236
## 6 chr7 2146
## 7 chr3 2121
## 8 chr12 2105
## 9 chr16 1906
## 10 chr5 1733
## # ℹ 59 more rows
# Number of sites per transcript (annotation columns carried along as grouping
# keys), most heavily modified transcripts first; written to `tabledir`.
# NOTE(review): export_tsv() receives no file name, so it presumably derives
# one from the object name - keep the name stable.
num_sites_in_transcripts <-
  methylated_positions |>
  group_by(
    seqname, transcript_id, transcript_name,
    gene_name, gene_type, transcript_type
  ) |>
  reframe(num_sites_in_tr = n()) |>
  arrange(desc(num_sites_in_tr))
export_tsv(num_sites_in_transcripts, outdir = tabledir)
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/num_sites_in_transcripts_2024-07-29.tsv
## # A tibble: 71 × 7
## seqname transcript_id transcript_name gene_name gene_type transcript_type
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 chr16 ENST00000343262.9 RPS2-201 RPS2 protein_… protein_coding
## 2 chrM ENST00000361789.2 MT-CYB-201 MT-CYB protein_… protein_coding
## 3 chrM ENST00000389680.2 MT-RNR1-201 MT-RNR1 Mt_rRNA Mt_rRNA
## 4 chrM ENST00000361453.3 MT-ND2-201 MT-ND2 protein_… protein_coding
## 5 chrM ENST00000361381.2 MT-ND4-201 MT-ND4 protein_… protein_coding
## 6 chrM ENST00000361624.2 MT-CO1-201 MT-CO1 protein_… protein_coding
## 7 chr11 ENST00000273550.… FTH1-201 FTH1 protein_… protein_coding
## 8 chr12 ENST00000392514.9 RPLP0-203 RPLP0 protein_… protein_coding
## 9 chrM ENST00000361739.1 MT-CO2-201 MT-CO2 protein_… protein_coding
## 10 chrM ENST00000361390.2 MT-ND1-201 MT-ND1 protein_… protein_coding
## # ℹ 61 more rows
## # ℹ 1 more variable: num_sites_in_tr <int>
# Number of transcripts carrying at least one site, per chromosome, largest
# first (one input row per transcript, so n() counts transcripts).
num_transcripts_with_m3Csites_groupedby_chr <-
  num_sites_in_transcripts |>
  group_by(seqname) |>
  reframe(n = n()) |>
  arrange(desc(n))
num_transcripts_with_m3Csites_groupedby_chr
## # A tibble: 24 × 2
## seqname n
## <chr> <int>
## 1 chrM 11
## 2 chr12 9
## 3 chr1 7
## 4 chr11 7
## 5 chr5 4
## 6 chr19 3
## 7 chr2 3
## 8 chr7 3
## 9 chr8 3
## 10 chr14 2
## # ℹ 14 more rows
# Per chromosome: transcripts carrying at least one site as a percentage of
# all transcripts annotated on that chromosome, highest rate first; written
# to `tabledir`. Only sequence names containing 'chr' are kept.
percent_m3CRNAs_in_chr <-
  num_transcripts_with_m3Csites_groupedby_chr |>
  filter(grepl('chr', seqname)) |>
  # Name the key explicitly so the join cannot silently start matching on any
  # other column the two tables might come to share.
  left_join(num_detected_transcripts_in_chromosomes, by = join_by(seqname)) |>
  mutate(percent_m3CRNAs_in_chr = 100 * n / num_detected_transcripts_in_chr) |>
  arrange(-percent_m3CRNAs_in_chr)
percent_m3CRNAs_in_chr |>
  export_tsv(outdir = tabledir)
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/percent_m3CRNAs_in_chr_2024-07-29.tsv
## # A tibble: 23 × 4
## seqname n num_detected_transcripts_in_chr percent_m3CRNAs_in_chr
## <chr> <int> <int> <dbl>
## 1 chrM 11 23 47.8
## 2 chr12 9 2105 0.428
## 3 chr11 7 2422 0.289
## 4 chr5 4 1733 0.231
## 5 chrX 2 913 0.219
## 6 chr8 3 1408 0.213
## 7 chr1 7 3605 0.194
## 8 chr18 1 525 0.190
## 9 chr20 2 1054 0.190
## 10 chr13 1 586 0.171
## # ℹ 13 more rows
# Horizontal bar chart of the per-chromosome rate, ordered by rate; written to
# `figdir`.
percent_m3CRNAs_in_chr_barplot <-
  ggplot(
    percent_m3CRNAs_in_chr,
    aes(
      x = reorder(seqname, percent_m3CRNAs_in_chr),
      y = percent_m3CRNAs_in_chr
    )
  ) +
  geom_col() +
  coord_flip() +
  labs(x = '', y = '% of transcripts\nwith m3C sites')
ggsave_multiple_formats(
  percent_m3CRNAs_in_chr_barplot,
  width = 4, height = 6, fontsize = 7, outdir = figdir
)
# Quick look (not saved): sites per transcript, one point per transcript,
# laid out by chromosome.
ggplot(
  num_sites_in_transcripts,
  aes(x = reorder(seqname, num_sites_in_tr), y = num_sites_in_tr)
) +
  geom_point() +
  coord_flip()
# Classify each site-bearing transcript into a coarse category and report the
# count and percentage of transcripts per category, plus donut segment bounds;
# written to `tabledir`.
num_transcripts_groupedby_genetype <-
  num_sites_in_transcripts |>
  mutate(
    # Any non-protein-coding chrM gene is labelled 'mt-rRNA'. Rows matching
    # none of the four conditions (non-coding, non-chrM, gene_type present)
    # are left as NA.
    genetype2 = case_when(
      gene_type == 'protein_coding' & seqname == 'chrM' ~ 'mt-mRNA',
      gene_type == 'protein_coding' & seqname != 'chrM' ~ 'mRNA',
      gene_type != 'protein_coding' & seqname == 'chrM' ~ 'mt-rRNA',
      is.na(gene_type) ~ 'unannotated gene'
    )
  ) |>
  group_by(genetype2) |>
  calc_percentage() |>
  add_yrange()
export_tsv(num_transcripts_groupedby_genetype, outdir = tabledir)
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/num_transcripts_groupedby_genetype_2024-07-29.tsv
## # A tibble: 4 × 5
## genetype2 n percentage ymax ymin
## <chr> <int> <dbl> <dbl> <dbl>
## 1 mRNA 59 83.1 0.831 0
## 2 mt-mRNA 9 12.7 0.958 0.831
## 3 mt-rRNA 2 2.82 0.986 0.958
## 4 unannotated gene 1 1.41 1 0.986
# Donut chart of transcripts per category; written to `figdir`.
num_transcripts_groupedby_genetype_donut <-
  donutplot_genetype(num_transcripts_groupedby_genetype)
ggsave_multiple_formats(
  num_transcripts_groupedby_genetype_donut,
  width = 5, height = 5, fontsize = 7, outdir = figdir
)
# Same categories as the per-transcript table, but counting sites: sum the
# per-transcript site counts within each category and express each sum as a
# percentage of all sites; written to `tabledir`.
num_m3Csites_groupedby_genetype <-
  num_sites_in_transcripts |>
  mutate(
    # Any non-protein-coding chrM gene is labelled 'mt-rRNA'. Rows matching
    # none of the four conditions (non-coding, non-chrM, gene_type present)
    # are left as NA.
    genetype2 = case_when(
      gene_type == 'protein_coding' & seqname == 'chrM' ~ 'mt-mRNA',
      gene_type == 'protein_coding' & seqname != 'chrM' ~ 'mRNA',
      gene_type != 'protein_coding' & seqname == 'chrM' ~ 'mt-rRNA',
      is.na(gene_type) ~ 'unannotated gene'
    )
  ) |>
  group_by(genetype2) |>
  reframe(num_m3Csite = sum(num_sites_in_tr)) |>
  mutate(percentage = 100 * num_m3Csite / sum(num_m3Csite))
export_tsv(num_m3Csites_groupedby_genetype, outdir = tabledir)
##
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/num_m3Csites_groupedby_genetype_2024-07-29.tsv
## # A tibble: 4 × 3
## genetype2 num_m3Csite percentage
## <chr> <int> <dbl>
## 1 mRNA 257 52.6
## 2 mt-mRNA 182 37.2
## 3 mt-rRNA 47 9.61
## 4 unannotated gene 3 0.613
# Donut chart of sites per category: display it, then write it to `figdir`.
num_m3Csites_groupedby_genetype_donut <-
  donutplot_genetype(num_m3Csites_groupedby_genetype)
num_m3Csites_groupedby_genetype_donut
ggsave_multiple_formats(
  num_m3Csites_groupedby_genetype_donut,
  width = 5, height = 5, fontsize = 7, outdir = figdir
)